In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import gensim
import sklearn
"""
from gensim import corpora, models, similarities
from gensim.utils import smart_open, simple_preprocess
from gensim.matutils import corpus2csc
from gensim.parsing.preprocessing import STOPWORDS
"""
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.gensim
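Note: on pyLDAvis 3.0 and newer the gensim bridge module was renamed, so the import above fails there. A minimal fallback sketch (the gensimvis alias is our own):

try:
    import pyLDAvis.gensim as gensimvis    # pyLDAvis < 3.0
except ImportError:
    import pyLDAvis.gensim_models as gensimvis    # pyLDAvis >= 3.0

With that alias, the pyLDAvis.gensim.prepare(...) call further down becomes gensimvis.prepare(...).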
In [2]:
sns.set_context("poster")
sns.set_style("ticks")
In [3]:
print "Numpy version: ", np.__version__
print "Pandas version: ", pd.__version__
print "Matplotlib version: ", plt.matplotlib.__version__
print "Seaborn version: ", sns.__version__
print "Sklearn version: ", sklearn.__version__
print "NLTK version: ", nltk.__version__
print "Gensim version: ", gensim.__version__
print "PyLDAvis version: ", pyLDAvis.__version__
We use NLTK for the movie reviews corpus, tokenization, tagging, and lexical resources. More details at: http://www.nltk.org/
In [4]:
nltk.download("movie_reviews")
nltk.download("punkt")
Out[4]:
In [5]:
from nltk.corpus import movie_reviews
In [6]:
movie_reviews.categories()
Out[6]:
In [7]:
movie_reviews.words()
Out[7]:
In [8]:
sents = movie_reviews.sents()
In [9]:
len(sents)
Out[9]:
In [10]:
sents[0]
Out[10]:
In [11]:
for i, s in enumerate(sents[:10]):
    print("S[%s]:\t%s" % (i, " ".join(s)))
We use gensim for phrase (collocation) detection, dictionary building, and LDA topic modeling. More details at: https://radimrehurek.com/gensim/
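As an aside, the simple_preprocess helper from the commented-out imports above tokenizes and lowercases raw text in one call; a quick sketch (the output comment is indicative):

from gensim.utils import simple_preprocess
simple_preprocess("The plot twists were GREAT, weren't they?")
# e.g. ['the', 'plot', 'twists', 'were', 'great', 'weren', 'they']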
In [12]:
bigrams = gensim.models.Phrases(sents[:100])
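Phrases counts unigram and bigram frequencies in the training sentences and, when transforming text, joins sufficiently frequent pairs into single word_pair tokens. A sketch on a made-up sentence (whether any pair is actually merged depends on the 100 training sentences):

sample = ["the", "special", "effects", "were", "good"]
bigrams[sample]
# the token list, with any frequent pair merged, e.g. "special_effects"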
In [13]:
list(bigrams.vocab.items())[:10]
Out[13]:
In [14]:
sorted(bigrams.vocab.items(), key=lambda x: x[1], reverse=True)[:10]
Out[14]:
In [15]:
word_frequencies = list(bigrams.vocab.values())
In [16]:
plt.hist(word_frequencies, bins=range(0,70000, 1000), log=True)
plt.xscale("symlog")
In [17]:
sorted([item for item in bigrams.vocab.items()
        if isinstance(item[0], str) and "_" in item[0]],
       key=lambda x: x[1], reverse=True)[:30]
Out[17]:
In [18]:
corpus = bigrams[sents[:100]]
id2word = gensim.corpora.Dictionary(corpus)
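On larger corpora it is common to prune very rare and very frequent tokens before topic modeling; gensim's Dictionary.filter_extremes does this in place. A sketch on a copy, so the id2word used below stays untouched (the thresholds are illustrative):

import copy
pruned = copy.deepcopy(id2word)
pruned.filter_extremes(no_below=5, no_above=0.5)  # keep tokens in >= 5 docs and <= 50% of docs
len(pruned)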
In [19]:
len(id2word.keys())
Out[19]:
In [20]:
corpus_processed = [id2word.doc2bow(k) for k in corpus]
print(len(corpus_processed))
In [21]:
corpus_processed[0]
Out[21]:
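To sanity-check the bag-of-words encoding, the (token_id, count) pairs can be mapped back through the dictionary:

[(id2word[token_id], count) for token_id, count in corpus_processed[0]]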
In [22]:
corpus[0]
Out[22]:
In [23]:
LDA_model = gensim.models.ldamodel.LdaModel(corpus_processed, num_topics=10, id2word=id2word)
In [24]:
LDA_model.print_topics()
Out[24]:
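A single topic can also be inspected as (word, probability) pairs; topic 0 below is arbitrary, and the words vary between runs since LDA training is stochastic:

LDA_model.show_topic(0, topn=10)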
In [25]:
LDA_model.get_document_topics(corpus_processed[0])
Out[25]:
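get_document_topics hides topics below a probability threshold by default; passing minimum_probability=0 returns effectively the full distribution:

LDA_model.get_document_topics(corpus_processed[0], minimum_probability=0)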
In [26]:
doc_topics = LDA_model[corpus_processed]
In [27]:
doc_topics[1]
Out[27]:
In [28]:
pyLDAvis.gensim.prepare(LDA_model, corpus_processed, id2word)
Out[28]:
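The interactive view can also be written to a standalone HTML page with pyLDAvis.save_html (the file name is illustrative):

vis = pyLDAvis.gensim.prepare(LDA_model, corpus_processed, id2word)
pyLDAvis.save_html(vis, "lda_movie_reviews.html")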
In [29]:
nltk.download("wordnet")
Out[29]:
In [30]:
from nltk.corpus import wordnet as wn
In [31]:
wn.synsets('dog')
Out[31]:
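Each synset carries a gloss, lemmas, and links to related synsets; a quick look at the first sense of 'dog':

dog = wn.synsets('dog')[0]
print(dog.definition())
print(dog.lemma_names())
print(dog.hypernyms())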
In [32]:
nltk.download("sentiwordnet")
Out[32]:
In [33]:
from nltk.corpus import sentiwordnet as swn
In [34]:
breakdown = swn.senti_synset('breakdown.n.03')
print(breakdown)
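The printed string bundles three scores, which are also available individually:

print(breakdown.pos_score(), breakdown.neg_score(), breakdown.obj_score())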
In [35]:
nltk.download("averaged_perceptron_tagger")
Out[35]:
In [36]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)
Out[36]:
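The tags follow the Penn Treebank tagset; NLTK can describe any tag once the tagsets resource is downloaded:

nltk.download("tagsets")
nltk.help.upenn_tagset("RB")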
In [37]:
nltk.download(["maxent_ne_chunker", "words", "stopwords"])
Out[37]:
In [38]:
text = nltk.word_tokenize("US president Barack Obama signed a new treaty with the Indian prime minister Narendra Modi, in New Delhi.")
pos_tags = nltk.pos_tag(text)
print(pos_tags)
In [39]:
chunk_tags = nltk.ne_chunk(pos_tags, binary=False)
print(chunk_tags)
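The result is an nltk.Tree; walking its labelled subtrees pulls out just the named entities. A minimal sketch:

for subtree in chunk_tags.subtrees():
    if subtree.label() != "S":
        entity = " ".join(word for word, tag in subtree.leaves())
        print(subtree.label(), entity)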